INITIAL QUALITY CONTROL

Bad quality cell filtering of n_UMI and n_genes by 0.01 and 0.99 quantiles and MT percentage of 20%

Calculate percent.mt

# Load libraries
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Load Data
Gruel= readRDS("/home/caminors/Desktop/sc_RNAseq/Raw_seurats/Gruel.rds")
SARC= readRDS("/home/caminors/Desktop/sc_RNAseq/Raw_seurats/SARC_demultiplexed.rds")
DefaultAssay(Gruel)="RNA"
DefaultAssay(SARC)="RNA"
## calculate mt percent
Gruel[["percent.mt"]] <- PercentageFeatureSet(Gruel, pattern = "^MT-", assay = "RNA")
SARC[["percent.mt"]] <- PercentageFeatureSet(SARC, pattern = "^MT-", assay = "RNA")

Plot Raw data

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# plotting
# Create the individual plots
plot1 <- ggplot(SARC@meta.data, aes(x=SARC$nCount_RNA, fill=SARC$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

plot2 <- ggplot(SARC@meta.data, aes(x=SARC$nFeature_RNA, fill=SARC$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

plot3 <- ggplot(Gruel@meta.data, aes(x=Gruel$nCount_RNA, fill=Gruel$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

plot4 <- ggplot(Gruel@meta.data, aes(x=Gruel$nFeature_RNA, fill=Gruel$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

# Arrange the plots in a 2x2 grid
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

library(ggplot2)
library(dplyr)
SARC@meta.data  %>% 
  ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mt)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 300) +
  facet_wrap(~orig.ident)
## `geom_smooth()` using formula = 'y ~ x'

Determine 0.01 and 0.99 quantile outliers in Gruel dataset for nCount and nFeatures

library(dplyr)
library(ggplot2)

# Extract metadata from Seurat object
meta_data <- Gruel@meta.data

# Calculate quantiles and classify cells
meta_data <- meta_data %>%
  group_by(orig.ident) %>%
  mutate(
    nCount_status = case_when(
      nCount_RNA > quantile(nCount_RNA, 0.99) ~ "Upper Outlier",
      nCount_RNA < quantile(nCount_RNA, 0.01) ~ "Lower Outlier",
      TRUE ~ "Within Range"
    ),
    nFeature_status = case_when(
      nFeature_RNA > quantile(nFeature_RNA, 0.99) ~ "Upper Outlier",
      nFeature_RNA < quantile(nFeature_RNA, 0.01) ~ "Lower Outlier",
      TRUE ~ "Within Range"
    )
  ) %>%
  ungroup()

Gruel@meta.data$nCount_status = meta_data$nCount_status
Gruel@meta.data$nFeature_status = meta_data$nFeature_status
table(Gruel$nCount_status)
## 
## Lower Outlier Upper Outlier  Within Range 
##          1694          1709        166165
table(Gruel$nFeature_status)
## 
## Lower Outlier Upper Outlier  Within Range 
##          1661          1709        166198
# Violin plot for nCount_RNA with color coding for outliers
Seurat::VlnPlot(Gruel, features = c("nCount_RNA"), group.by = "orig.ident", fill.by = "nCount_status", pt.size=0) +
  geom_jitter(aes(color = Gruel@meta.data$nCount_status), size = 0.5, width = 0.2) +
  scale_color_manual(values = c("Upper Outlier" = "red", "Lower Outlier" = "blue", "Within Range" = "black")) +
  labs(title = "nCount_RNA Distribution by orig.ident with Outlier Highlights")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

# Violin plot for nFeature_RNA with color coding for outliers
VlnPlot(Gruel, features = c("nFeature_RNA"), group.by = "orig.ident", pt.size = 0) +
  geom_jitter(aes(color = Gruel@meta.data$nFeature_status), size = 0.5, width = 0.2) +
  scale_color_manual(values = c("Upper Outlier" = "red", "Lower Outlier" = "blue", "Within Range" = "black")) +
  labs(title = "nFeature_RNA Distribution by orig.ident with Outlier Highlights")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

Plot 0.01 and 0.99 quantile outliers in SARC dataset

library(dplyr)

# Extract metadata from Seurat object
meta_data <- SARC@meta.data

# Calculate quantiles and classify cells
meta_data <- meta_data %>%
  group_by(orig.ident) %>%
  mutate(
    nCount_status = case_when(
      nCount_RNA > quantile(nCount_RNA, 0.99) ~ "Upper Outlier",
      nCount_RNA < quantile(nCount_RNA, 0.01) ~ "Lower Outlier",
      TRUE ~ "Within Range"
    ),
    nFeature_status = case_when(
      nFeature_RNA > quantile(nFeature_RNA, 0.99) ~ "Upper Outlier",
      nFeature_RNA < quantile(nFeature_RNA, 0.01) ~ "Lower Outlier",
      TRUE ~ "Within Range"
    )
  ) %>%
  ungroup()

SARC@meta.data$nCount_status = meta_data$nCount_status
SARC@meta.data$nFeature_status = meta_data$nFeature_status
table(SARC$nCount_status)
## 
## Lower Outlier Upper Outlier  Within Range 
##           779           784         76396
table(SARC$nFeature_status)
## 
## Lower Outlier Upper Outlier  Within Range 
##           779           783         76397
# Violin plot for nCount_RNA with color coding for outliers
Seurat::VlnPlot(SARC, features = c("nCount_RNA"), group.by = "orig.ident", fill.by = "nCount_status", pt.size=0) +
  geom_jitter(aes(color = SARC@meta.data$nCount_status), size = 0.5, width = 0.2) +
  scale_color_manual(values = c("Upper Outlier" = "red", "Lower Outlier" = "blue", "Within Range" = "black")) +
  labs(title = "nCount Distribution by orig.ident with Outlier Highlights")

# Violin plot for nFeature_RNA with color coding for outliers
VlnPlot(SARC, features = c("nFeature_RNA"), group.by = "orig.ident", pt.size = 0) +
  geom_jitter(aes(color = SARC@meta.data$nFeature_status), size = 0.5, width = 0.2) +
  scale_color_manual(values = c("Upper Outlier" = "red", "Lower Outlier" = "blue", "Within Range" = "black")) +
  labs(title = "nFeature Distribution by orig.ident with Outlier Highlights")

SARC@meta.data %>%
  ggplot(aes(x = nCount_RNA, y = nFeature_RNA, color = nFeature_status)) + 
  geom_point() + 
  scale_colour_manual(values = c("Upper Outlier" = "red", 
                                 "Lower Outlier" = "blue", 
                                 "Within Range" = "grey")) + 
  stat_smooth(method = "lm") +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 300) +
  facet_wrap(~orig.ident)
## `geom_smooth()` using formula = 'y ~ x'

VlnPlot(Gruel, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

VlnPlot(SARC, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "orig.ident")

Filtering by Quantile and percent.mt <20

Gruel_f <- subset(Gruel, subset = nCount_status == "Within Range" & nFeature_RNA >300 & nFeature_status =="Within Range" & percent.mt < 20) ## setting these thresholds as they seem the best for SARCOMA dataset :) most nFeature_RNA <300 are actually not "within Range" but just in case
SARC_f <- subset(SARC, subset = nCount_status == "Within Range" & nFeature_RNA >300 & nFeature_status =="Within Range" & percent.mt < 20)

VlnPlot(Gruel_f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

VlnPlot(SARC_f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by="orig.ident")

Recheck histograms of filtered objects

# Load the required package
library(gridExtra)

# Create the individual plots
plot1 <- ggplot(SARC_f@meta.data, aes(x=SARC_f$nCount_RNA, fill=SARC_f$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

plot2 <- ggplot(SARC_f@meta.data, aes(x=SARC_f$nFeature_RNA, fill=SARC_f$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

plot3 <- ggplot(Gruel_f@meta.data, aes(x=Gruel_f$nCount_RNA, fill=Gruel_f$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

plot4 <- ggplot(Gruel_f@meta.data, aes(x=Gruel_f$nFeature_RNA, fill=Gruel_f$orig.ident)) +
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

# Arrange the plots in a 2x2 grid
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

SARC_f@meta.data  %>% 
    ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mt)) + 
    geom_point() + 
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method=lm) +
    scale_x_log10() + 
    scale_y_log10() + 
    theme_classic() +
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = 300) +
    facet_wrap(~orig.ident)
## `geom_smooth()` using formula = 'y ~ x'